library(RMySQL)
library(tidyverse) #Data Manipulation and Plotting
library(lubridate) #Date Manipulation
library(arulesSequences) #Running the Sequence mining algorithm
library(ggtext) #Adding some flair to plots
library(tidygraph) ## Creating a Graph Structure
library(ggraph) ## Plotting the Network Graph Structure
library(arules)
library(arulesViz)
library(arulesSequences)
library(redivis)
MIMIC-III documentation: https://mimic.mit.edu/docs/iii/
Association rule learning is a rule-based machine learning method for discovering interesting relations between variables in large databases. It is intended to identify strong rules discovered in databases. In any given transaction with a variety of items, association rules are meant to discover the rules that determine how or why certain items are connected.
Definition:
Transactions: \(D=\{t_1, t_2, ..., t_m\}\)
Items: \(I=\{i_1, i_2, ..., i_n\}\)
Rule: \(X \Rightarrow Y\), X and Y are both sets of items, also known as itemsets
X: antecedent or left-hand-side (LHS)
Y: consequent or right-hand-side (RHS)
This simply implies that, in theory, whenever X occurs in a dataset, then Y will occur as well.
Measures of interestingness:
Support: Support is an indication of how frequently the itemset appears in the dataset. The support of X with respect to T is defined as the proportion of transactions in the dataset which contains the itemset X.
\(\text{support of X} = P(X) = \frac{\text{# of transactions containing X}}{\text{total # of transactions}} = \frac{|\{(i, t)\in T:X \subseteq t\}|}{|T|}\)
Confidence: Confidence is the percentage of all transactions satisfying X that also satisfy Y. With respect to T, the confidence value of an association rule \(X \Rightarrow Y\), is the ratio of transactions containing both X and Y to the total amount of X values present, where X is the antecedent and Y is the consequent.
\(conf(X \Rightarrow Y) = P(Y|X) = \frac{supp(X \cup Y)}{supp(X)} = \frac{\text{# of transactions containing X and Y}}{\text{# of transactions containing X}}\)
Lift: Lift is defined as the ratio of the observed support to that expected if X and Y were independent. The value of lift indicates the degree to which the two itemsets are dependent on one another.
\(lift(X \Rightarrow Y) = \frac{supp(X \cup Y)}{supp(X) \times supp(Y)}\)
lift > 1: the degree to which those two occurrences are dependent on one another, and makes those rules potentially useful for predicting the consequent in future data sets.
lift < 1: the items are substitutes for each other. This means that the presence of one item has a negative effect on the presence of the other item, and vice versa.
By setting the thresholds of Support and Confidence, we can find out the association rules which can satisfy the specified minimum support and confidence at the same time.
Usually, association rule generation is split into two different steps that need to be applied:
A minimum Support threshold to find all the frequent itemsets that are in the database.
A minimum Confidence threshold to the frequent itemsets found to create rules.
Notably, this simple algorithm can help find the shared associations of diseases between patients, but it ignores the timing information of the diagnoses.
Table used here: ADMISSIONS
Official documents: https://mimic.mit.edu/docs/iii/tables/admissions/
In this instance, data are extracted from Redivis database via a private api token and processed in the steps below:
delete patients who only have one admission record as “NEWBORN”;
delete patients who only have one diagnosis record;
reformat the dataset into a dataframe with column:(subject_id, diagnosis);
convert the structure of dataset from dataframe to transactions.
# Load private credentials (Redivis API token) kept outside version control.
source("config.R")
# Pull the full MIMIC-III ADMISSIONS table from Redivis into a tibble.
# NOTE(review): this downloads the whole table — assumes it fits in memory.
admissions <- redivis::user("graph_ai")$
dataset("mimic_iii_clinical_data:9rva:next")$
table("admissions:k1ek")$
to_tibble()
# One row per (patient, diagnosis): NEWBORN admissions are dropped, the
# ";"-separated diagnosis field is exploded into individual rows, and
# blank / single-space entries are discarded.
ad_df <- admissions %>%
  janitor::clean_names() %>%
  filter(diagnosis != "NEWBORN") %>%
  mutate(admittime = as.Date(admittime)) %>%
  arrange(subject_id, admittime) %>%
  select(subject_id, admittime, diagnosis) %>%
  separate_rows(diagnosis, sep = ";") %>%
  filter(diagnosis != "", diagnosis != " ")
# Diagnosis-row count per patient, largest first.
ad_count <- ad_df %>%
  count(subject_id, sort = TRUE)
# Keep only patients contributing more than one diagnosis row, then reduce
# to the (subject_id, diagnosis) shape expected by arules' "long" format.
ad_long_df <- ad_df %>%
  add_count(subject_id) %>%
  filter(n > 1) %>%
  select(-n, -admittime)
# Spot-check one patient's pooled diagnoses.
ad_long_df %>% filter(subject_id == 109) %>% head() %>% knitr::kable()
| subject_id | diagnosis |
|---|---|
| 109 | HYPERTENSIVE EMERGENCY |
| 109 | HTN CRISIS |
| 109 | HYPERTENSION |
| 109 | HYPERTENSIVE URGENCY |
| 109 | HYPERTENSIVE URGENCY |
| 109 | HYPERTENSIVE URGENCY |
# Encode the long table as arules transactions: one transaction per
# patient, whose items are all diagnoses ever recorded for that patient.
long_cols <- c("subject_id", "diagnosis")
ad_trans <- transactions(ad_long_df, format = "long", cols = long_cols)
Have a quick view of the transactions:
ad_trans %>% as("data.frame") %>% as_tibble() %>% head() %>% knitr::kable()
| items | transactionID |
|---|---|
| { MINIMALLY INVASIVE RE-DO/SDA,AORTIC INSUFFICIENCYVALVE REPLACEMENT} | 100 |
| {END STAGE KIDNEY DISEASE,END STAGE LIVER DISEASE} | 10000 |
| {ICH,S/P FALL,SUBDURAL HEMATOMA,SEIZURE,C3 FRACTURE} | 10004 |
| { REPAIR ASCENDING AORTIC ANEURYSM/SDA,ASCENDING AORTIC ANEURYSMVALVE REPLACEMENT} | 10007 |
| { MITRAL REGURGITATION,CORONARY ARTERY DISEASEARTERY BYPASS GRAFT WITH MVR ? MITRAL VALVE REPLACEMENT /SDA} | 10027 |
| {SYNCOPE,TELEMETRY} | 10029 |
# Top-20 most frequent diagnoses, once as raw counts and once as shares
# of all transactions (identical bars, different y-axis scale).
for (freq_kind in c("Absolute", "Relative")) {
  arules::itemFrequencyPlot(
    ad_trans,
    topN = 20,
    type = tolower(freq_kind),
    main = paste(freq_kind, "Item Frequency Plot"),
    ylab = paste0("Item Frequency (", freq_kind, ")")
  )
}
# Mine association rules: itemsets must appear in at least 0.2% of
# patients, and a rule must hold with at least 20% confidence.
apriori_params <- list(supp = 0.002, conf = 0.2)
rules <- apriori(ad_trans, parameter = apriori_params)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.002 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 24
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[8239 item(s), 12244 transaction(s)] done [0.01s].
## sorting and recoding items ... [147 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [36 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Ten most confident rules with their quality measures.
rules %>%
  as("data.frame") %>%
  as_tibble() %>%
  select(rules, support, confidence, lift, count) %>%
  arrange(desc(confidence)) %>%
  slice(1:10) %>%
  knitr::kable()
| rules | support | confidence | lift | count |
|---|---|---|---|---|
| {TELEMETRY,TRANSIENT ISCHEMIC ATTACK} => {STROKE} | 0.0279320 | 1.0000000 | 23.728682 | 342 |
| {STROKE,TRANSIENT ISCHEMIC ATTACK} => {TELEMETRY} | 0.0279320 | 0.9606742 | 7.198589 | 342 |
| {TRANSIENT ISCHEMIC ATTACK} => {STROKE} | 0.0290755 | 0.9468085 | 22.466518 | 356 |
| {TRANSIENT ISCHEMIC ATTACK} => {TELEMETRY} | 0.0279320 | 0.9095745 | 6.815685 | 342 |
| {CHRONIC OBST PULM DISEASE,COPD EXACERBATION} => {ASTHMA} | 0.0022052 | 0.8709677 | 45.186987 | 27 |
| {STROKE} => {TELEMETRY} | 0.0366710 | 0.8701550 | 6.520305 | 449 |
| {PYELONEPHRITIS} => {URINARY TRACT INFECTION} | 0.0109441 | 0.8535032 | 37.863380 | 134 |
| {COPD EXACERBATION,PNEUMONIA} => {ASTHMA} | 0.0029402 | 0.8372093 | 43.435554 | 36 |
| {COPD EXACERBATION} => {ASTHMA} | 0.0138027 | 0.8284314 | 42.980143 | 169 |
| {ASTHMA,PNEUMONIA} => {COPD EXACERBATION} | 0.0029402 | 0.8000000 | 48.015686 | 36 |
# Interactive scatter plot of all rules: support vs. confidence, shaded by
# lift. NOTE(review): `interactive = TRUE` looks redundant alongside
# engine = "htmlwidget" (htmlwidgets are interactive by construction) —
# confirm against the installed arulesViz version.
plot(rules, method = NULL, measure = "support", shading = "lift",
interactive = TRUE,
engine = "htmlwidget")
# Network graph of the 20 rules with the highest confidence.
topsubRules <- head(rules, n = 20, by = "confidence")
plot(topsubRules, method = "graph", engine = "htmlwidget")
# Parallel-coordinates view of the same top rules (antecedent -> consequent).
plot(topsubRules, method="paracoord")
Sequential Pattern Mining is a topic of Data Mining concerned with finding statistically relevant patterns between data examples where the values are delivered in a sequence.
A sequence cares about the order of events but does not require a notion of time.
Common algorithms for sequential pattern mining:
GSP (Generalized Sequential Pattern Mining)
SPADE (Sequential Pattern Discovery using Equivalence Class): A vertical format sequential pattern mining method which identifies each element in each sequence in a dataset with a Sequence ID (SID) and the Element ID (EID)
Non-Apriori Based Algorithms
PrefixSpan (Prefix-projected Sequential Pattern Mining)
Concept:
Sequence: An ordered list of itemsets.
Event: The element of a sequence.
Item: The element of an event.
Target structure of the dataframe which can be converted to class of transactions:
items: The diagnosis within one admission
transactionID: The identifier of transaction
sequenceID: Patient identifier
eventID: Event identifier
classID (optional): class information of patients
# Rebuild the diagnosis table for sequence mining. Each admission becomes
# one event: trans_id is a global admission identifier, event_id orders a
# patient's admissions in time, and items holds that admission's diagnoses.
# NOTE(review): unlike the association-rule pipeline above, NEWBORN
# admissions are NOT filtered out here — confirm this is intentional.
ad_df <- admissions %>%
  janitor::clean_names() %>%
  mutate(
    admittime = as.Date(admittime),
    trans_id = row_number()  # assigned in raw table order, before sorting
  ) %>%
  arrange(subject_id, admittime) %>%
  select(subject_id, diagnosis, trans_id) %>%
  group_by(subject_id) %>%
  mutate(
    # Position of the admission within the patient's time-ordered history
    # (computed before separate_rows so it indexes admissions, not rows).
    event_id = row_number()
  ) %>%
  ungroup() %>%
  # One admission may carry several ";"-separated diagnoses.
  separate_rows(diagnosis, sep = ";") %>%
  filter(diagnosis != "") %>%
  filter(diagnosis != " ") %>%
  group_by(subject_id, trans_id, event_id) %>%
  # Collapse each admission's diagnoses into a list-column; drop grouping
  # explicitly instead of relying on summarise's default (and its message).
  summarise(items = list(diagnosis), .groups = "drop")
# Number of admission-events recorded per patient, most active first.
ad_count <- ad_df %>%
  group_by(subject_id) %>%
  tally(sort = TRUE)
# Restrict to patients with more than one admission-event; a single event
# carries no sequential information. Rows are ordered by patient and
# event so the transaction metadata assigned below lines up positionally.
ad_long_df <- ad_df %>%
left_join(ad_count, by = "subject_id") %>%
filter(n > 1) %>%
select(-n) %>%
arrange(subject_id, event_id)
# Build one transaction per (patient, admission) and name each with its
# row index so transaction IDs are 1..N.
item_list <- as.list(ad_long_df$items)
names(item_list) <- seq_along(item_list)  # safer than c(1:length(x)) for empty input
ad_trans <- as(item_list, "transactions")
# cSPADE requires sequence (patient) and event (admission order)
# identifiers in the transaction metadata; rows align positionally with
# ad_long_df because the transactions were built from it in order.
transactionInfo(ad_trans)$sequenceID <- ad_long_df$subject_id
transactionInfo(ad_trans)$eventID <- ad_long_df$event_id
ad_trans %>% as("data.frame") %>% as_tibble() %>% head(20) %>% knitr::kable()
| items | transactionID | sequenceID | eventID |
|---|---|---|---|
| {PATIENT FORAMEN OVALE PATENT FORAMEN OVALE MINIMALLY INVASIVE/SDA} | 1 | 17 | 1 |
| {PERICARDIAL EFFUSION} | 2 | 17 | 2 |
| {CONGESTIVE HEART FAILURE} | 3 | 21 | 1 |
| {SEPSIS} | 4 | 21 | 2 |
| {CORONARY ARTERY DISEASEARTERY BYPASS GRAFT/SDA} | 5 | 23 | 1 |
| {BRAIN MASS} | 6 | 23 | 2 |
| {CHEST PAIN} | 7 | 34 | 1 |
| {BRADYCARDIA} | 8 | 34 | 2 |
| {CORONARY ARTERY DISEASEARTERY BYPASS GRAFT /SDA} | 9 | 36 | 1 |
| {CHEST PAIN/SHORTNESS OF BREATH} | 10 | 36 | 2 |
| {VENTRAL HERNIA/SDA} | 11 | 36 | 3 |
| {NON-HODGKIN’S LYMPHOMIAMARROW TRANSPLANT} | 12 | 61 | 1 |
| {FEBRILE,NEUTROPENIA,NON-HODGKINS LYMPHOMA} | 13 | 61 | 2 |
| {INCISIONAL HERNIA} | 14 | 67 | 1 |
| {SUBARACHNOID HEMORRHAGE} | 15 | 67 | 2 |
| {PNEUMONIA} | 16 | 68 | 1 |
| {WEAKNESS} | 17 | 68 | 2 |
| {MEDIAL PARIETAL TUMOR/SDA} | 18 | 84 | 1 |
| {GLIOBLASTOMA,NAUSEA} | 19 | 84 | 2 |
| {AORTIC STENOSISCATH} | 20 | 85 | 1 |
# Top-20 most frequent diagnoses in the per-admission transactions,
# as raw counts and as shares of all transactions.
for (freq_kind in c("Absolute", "Relative")) {
  arules::itemFrequencyPlot(
    ad_trans,
    topN = 20,
    type = tolower(freq_kind),
    main = paste(freq_kind, "Item Frequency Plot"),
    ylab = paste0("Item Frequency (", freq_kind, ")")
  )
}
# Mine frequent diagnosis sequences with cSPADE: a sequence is kept when
# it occurs in at least 0.1% of patients (sequenceID groups).
itemsets <- cspade(ad_trans,
parameter = list(support = 0.001),
control = list(verbose = FALSE))
# as(itemsets, "data.frame") %>% as_tibble() %>%
# arrange(desc(support)) %>%
# slice_max(support, n = 20) %>%
# ggplot(aes(x = fct_reorder(sequence, support),
# y = support,
# fill = sequence)) +
# geom_col() +
# geom_label(aes(label = support %>% scales::percent()), hjust = 0.5) +
# labs(
# x = "Diagnosis",
# y = "Support",
# title = "Diagnosis Frequency") +
# scale_fill_discrete(guide = F) +
# scale_y_continuous(labels = scales::percent,
# expand = expansion(mult = c(0, .1))) +
# coord_flip() +
# cowplot::theme_cowplot() +
# theme(
# axis.text=element_text(size=6),
# plot.caption = element_markdown(hjust = 0),
# plot.caption.position = 'plot',
# plot.title.position = 'plot'
# )
summary(itemsets)
## set of 674 sequences with
##
## most frequent items:
## PNEUMONIA ASTHMA TELEMETRY
## 86 71 67
## CONGESTIVE HEART FAILURE COPD EXACERBATION (Other)
## 60 58 730
##
## most frequent elements:
## {PNEUMONIA} {CONGESTIVE HEART FAILURE}
## 80 52
## {SEPSIS} {ASTHMA}
## 50 47
## {TELEMETRY} (Other)
## 41 745
##
## element (sequence) size distribution:
## sizes
## 1 2 3 4 5
## 310 299 62 2 1
##
## sequence length distribution:
## lengths
## 1 2 3 4 5
## 270 306 74 20 4
##
## summary of quality measures:
## support
## Min. :0.001063
## 1st Qu.:0.001196
## Median :0.001728
## Mean :0.004203
## 3rd Qu.:0.003588
## Max. :0.113090
##
## includes transaction ID lists: FALSE
##
## mining info:
## data ntransactions nsequences support
## ad_trans 19958 7525 0.001
# Derive sequential rules from the frequent sequences, keeping any rule
# holding with at least 10% confidence, then list the most confident ones.
rules <- ruleInduction(itemsets, confidence = 0.1, control = list(verbose = FALSE))
# inspect(head(rules))
rules %>%
  as("data.frame") %>%
  as_tibble() %>%
  arrange(desc(confidence)) %>%
  slice(1:6) %>%
  knitr::kable()
| rule | support | confidence | lift |
|---|---|---|---|
| <{DIABETIC KETOACIDOSIS,TELEMETRY}> => <{DIABETIC KETOACIDOSIS}> | 0.0010631 | 0.8000000 | 44.26471 |
| <{COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> | 0.0013289 | 0.6666667 | 33.44444 |
| <{HYPERGLYCEMIA},{DIABETIC KETOACIDOSIS}> => <{DIABETIC KETOACIDOSIS}> | 0.0010631 | 0.6666667 | 36.88725 |
| <{ASTHMA,COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> | 0.0010631 | 0.6666667 | 33.44444 |
| <{COPD EXACERBATION},{CHRONIC OBST PULM DISEASE}> => <{ASTHMA}> | 0.0017276 | 0.6190476 | 31.05556 |
| <{COPD EXACERBATION},{ASTHMA,CHRONIC OBST PULM DISEASE}> => <{COPD EXACERBATION}> | 0.0011960 | 0.6000000 | 33.69403 |
# Sequence mining, attempt 2: instead of one event per admission, merge a
# patient's admissions into the same event ("segment") unless they are
# separated by a long time gap, so diagnoses made close together form one
# itemset.
# NOTE(review): the gap threshold is 395 * 100 days (~108 years) in
# seconds, which effectively never splits a history — confirm whether the
# `* 100` factor is intentional (MIMIC-III shifts dates per patient but
# preserves within-patient intervals).
ad_df <- admissions %>%
  janitor::clean_names() %>%
  # filter(subject_id==109) %>%
  filter(diagnosis != "NEWBORN") %>%
  mutate(
    admittime = as.POSIXct(admittime, format = "%Y-%m-%d %H:%M:%S")
  ) %>%
  arrange(subject_id, admittime) %>%
  select(subject_id, admittime, diagnosis) %>%
  separate_rows(diagnosis, sep = ";") %>%
  filter(diagnosis != "") %>%
  filter(diagnosis != " ") %>%
  group_by(subject_id) %>%
  arrange(admittime) %>%
  mutate(
    # Force the gap into seconds explicitly: bare POSIXct subtraction
    # returns a difftime whose units are chosen automatically, making the
    # numeric comparison below unit-dependent (a latent bug).
    time_diff = as.numeric(difftime(admittime, lag(admittime), units = "secs")),
    # A new segment starts at the first admission (NA gap) or after a gap
    # of at least the threshold.
    new_segment = if_else(is.na(time_diff) | time_diff >= 60*60*24*395*100, 1, 0),
    event_id = cumsum(new_segment)
  ) %>%
  ungroup() %>%
  group_by(subject_id, event_id) %>%
  summarise(
    items = list(diagnosis),
    # "drop_last" matches summarise's default (result grouped by
    # subject_id) while silencing the grouping message.
    .groups = "drop_last"
  )
# Number of segments per patient, most active first.
ad_count <- ad_df %>%
  group_by(subject_id) %>%
  tally(sort = TRUE)
# Keep only patients with more than one segment; a single segment has no
# ordering to mine.
ad_long_df <- ad_df %>%
left_join(ad_count, by = "subject_id") %>%
filter(n > 1) %>%
select(-n)
# Preview the per-segment itemsets.
ad_long_df %>% head(10) %>% knitr::kable()
# One transaction per (patient, segment); names become transaction IDs.
item_list <- as.list(ad_long_df$items)
names(item_list) <- seq_along(item_list)  # safer than c(1:length(x)) for empty input
ad_trans <- as(item_list, "transactions")
# Attach the sequence metadata cSPADE needs: patient as sequenceID and
# segment order as eventID; rows align positionally with ad_long_df.
transactionInfo(ad_trans)$sequenceID <- ad_long_df$subject_id
transactionInfo(ad_trans)$eventID <- ad_long_df$event_id
ad_trans_df <- ad_trans %>% as("data.frame") %>% as_tibble()
head(ad_trans_df, 10) %>% knitr::kable()
# Mine frequent diagnosis sequences over the segment-based transactions
# (minimum support: 0.1% of patients).
itemsets <- cspade(ad_trans,
parameter = list(support = 0.001),
control = list(verbose = FALSE))
# Horizontal bar chart of the 20 sequences with the highest support.
as(itemsets, "data.frame") %>% as_tibble() %>%
  arrange(desc(support)) %>%
  slice_max(support, n = 20) %>%
  ggplot(aes(x = fct_reorder(sequence, support),
             y = support,
             fill = sequence)) +
  geom_col() +
  geom_label(aes(label = scales::percent(support)), hjust = 0.5) +
  labs(
    x = "Diagnosis",
    y = "Support",
    title = "Diagnosis Frequency") +
  # `guide = F` used the deprecated logical form (and the reassignable F);
  # "none" is the supported way to suppress the legend.
  scale_fill_discrete(guide = "none") +
  scale_y_continuous(labels = scales::percent,
                     expand = expansion(mult = c(0, .1))) +
  coord_flip() +
  cowplot::theme_cowplot() +
  theme(
    axis.text = element_text(size = 6),
    plot.caption = element_markdown(hjust = 0),
    plot.caption.position = 'plot',
    plot.title.position = 'plot'
  )
# Sequence summary for the segment-based transactions.
summary(itemsets)
# Induce sequential rules holding with at least 10% confidence.
rules <- ruleInduction(itemsets,
confidence = 0.1,
control = list(verbose = FALSE))
# inspect(head(rules))
# Show the 20 rules with the highest support.
as(rules, "data.frame") %>% as_tibble() %>%
arrange(desc(support)) %>%
head(20) %>%
knitr::kable()